from typing import Dict
from enum import Enum
from pydantic import BaseModel

from open_rag_eval.models.llm_judges import LLMJudgeModel
from open_rag_eval.metrics.base_metrics import AugmentedGenerationMetric
from open_rag_eval.data_classes.rag_results import AugmentedGenerationResult


# Closed set of verdicts the judge may return for "was the query answered?".
# Mixing in `str` makes each member also a plain string equal to its value.
# NOTE(review): documented with `#` comments rather than a docstring because
# this enum is a field type of the pydantic model passed as `response_format`;
# a docstring may end up in the generated schema -- confirm before adding one.
class QueryAnsweredValues(str, Enum):
    YES = "yes"  # the answer is an attempt to answer the query
    NO = "no"  # the answer is a no-answer / refusal style response


# Structured-output schema handed to the judge model as `response_format`.
# NOTE(review): kept as `#` comments on purpose -- pydantic includes a model's
# docstring in its generated JSON schema, which would presumably change what
# is sent to the judge; confirm before converting this to a docstring.
class QueryAnswered(BaseModel):
    # The judge's verdict, restricted to the QueryAnsweredValues members.
    answered: QueryAnsweredValues


class NoAnswerMetric(AugmentedGenerationMetric):
    """
    This metric uses LLM as a judge to determine if the generated answer contains an attempt at answering the query or if it is
    a no-answer response. It does not check if the answer is correct or incorrect, but only if it is an attempt to answer the query.
    """

    _ANSWERED_PROMPT = """
        In this task, you will evaluate the answer generated by a
        student for a given question and determine if
        the answer is an attempt to answer the question or not.

        You will be provided a query and its corresponding answer.
        If the answer is an attempt to answer the query, respond with "yes".
        If the answer is not an attempt to answer the query, respond with "no".

        Do NOT consider whether the answer is correct or incorrect.
        Do NOT consider whether the answer is fluent or well-formed.
        Do NOT consider whether the answer is relevant to the query.
        All you have to care about is whether the answer is an attempt to answer the query
        or just saying it cant answer or there isnt enough information to answer the query.

        Please provide your response based on the information in the
        query and answer. If you are unsure, use your best judgment. Respond as
        either ``yes``, ``no`` with no additional information.

        Here are some examples:

        Query: Is the sky blue?
        Answer: Yes, the sky is blue.
        Your Response: yes

        Query: What is the capital of France?
        Answer: The capital of France is Beijing.
        Your Response: yes

        Query: What is a blackhole?
        Answer: No result found
        Your Response: no

        Query: How tall is mount everest?
        Answer: Not enough information to answer the question. I don't know.
        Your Response: no

        Ok now your turn to try:

        Query: {query}
        Answer: {answer}
        Your Response:
    """

    def __init__(self, model: LLMJudgeModel):
        """Initialize the NoAnswerMetric metric.

        Args:
            model (LLMJudgeModel): The model to use for the metric assessment.
        """
        self.model = model

    def compute(self, generation_result: AugmentedGenerationResult) -> Dict[str, str]:
        """Ask the judge model whether the generated answer attempts to answer the query.

        The ``text`` of every part in ``generation_result.generated_answer`` is
        joined with single spaces, inserted into ``_ANSWERED_PROMPT`` together
        with ``generation_result.query``, and sent to ``self.model.parse`` with
        ``QueryAnswered`` as the response format.

        Args:
            generation_result (AugmentedGenerationResult): The query and the
                generated answer parts to judge.

        Returns:
            Dict[str, str]: A dict with the single key ``"query_answered"``
            holding the ``.value`` of the judge's verdict (``"yes"`` or ``"no"``
            for ``QueryAnsweredValues``).

        Raises:
            Exception: If building the prompt, calling the judge, or reading
                its verdict fails. The original error is chained as the cause.
        """
        answer_text = " ".join(
            part.text for part in generation_result.generated_answer
        )
        try:
            prompt = self._ANSWERED_PROMPT.format(
                query=generation_result.query, answer=answer_text
            )
            response = self.model.parse(
                prompt,
                response_format=QueryAnswered,
                # Fixed sampling settings, presumably so repeated runs give
                # the same verdict.
                model_kwargs={
                    "temperature": 0.0,
                    "seed": 42,
                },
            )
            # Read the verdict defensively: the judge may hand back no object
            # at all, or one without a usable `answered` value.
            verdict = getattr(response, "answered", None)
            if not verdict:
                # QueryAnswered declares no `refusal` field, so a direct
                # attribute access here would mask the real failure with an
                # AttributeError; fall back to None when it is absent.
                refusal = getattr(response, "refusal", None)
                raise ValueError(f"Failed to parse response: {refusal}")

            scores = {"query_answered": verdict.value}
        except Exception as e:
            raise Exception(f"Error computing NoAnswer metric: {str(e)}") from e

        return scores
